Read in and clean data
# ss is csv file with school shooting data (individual incidents)
ss = read_csv("ss_data.csv")%>%
select(-X1)%>%
mutate(year = as.numeric(substr(date, (nchar(date)-3),nchar(date))))%>%
mutate(state = ifelse(state == "St. Croix, US Virgin Islands","St.Croix",state))%>%
distinct_at(.vars = c(1:24), .keep_all = T)
Population data
# data set to go from state names to abbreviations
states = data.frame(state.name,state.abb)
dc = data.frame(matrix(ncol= 2, nrow = 1))
colnames(dc)=colnames(states)
dc$`state.name` = "District of Columbia"
dc$`state.abb` = "DC"
states = rbind(states,dc)
states$state.name = as.character(states$state.name)
if(import_raw_data){
# 1980-1990
pop8090_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt" )%>%data.frame()
pop8090_raw = pop8090_raw[c(8:58,64:114),]%>%data.frame()
colnames(pop8090_raw)="var"
pop8090 = data.frame(pop8090_raw$var[1:51],pop8090_raw$var[52:102])
colnames(pop8090)=c("var1","var2")
pop8090 = pop8090%>%
separate(var1, c("state",as.character(seq(1980,1984,by = 1))), extra = "merge")%>%
separate(var2, c("state1",as.character(seq(1985,1990,by = 1))), extra = "merge")%>%
select(c(state,starts_with("19")))
# 1970 - 1980
pop7080_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st7080ts.txt")
pop7080_raw = pop7080_raw[c(10:61,63:114),]%>%data.frame()
colnames(pop7080_raw)="var"
pop7080 = data.frame(pop7080_raw$var[1:51],pop7080_raw$var[53:103])
colnames(pop7080)=c("var1","var2")
pop7080 = pop7080%>%
separate(var1, c("blnk","id","state","1970","1971","1972","1973","1974","1975"), extra = "merge")%>%
separate(var2, c("blnk1","id1","state1",as.character(seq(1976,1980,by = 1))), extra = "merge")%>%
select(c(state,starts_with("19")))
# 1990-2000
pop90_00 = read_csv("../../data/pop_90-00.csv")%>%
rename(state = X1)%>%
select(c(state,3:13))%>%
filter(state != "USA")
colnames(pop90_00)[2:ncol(pop90_00)]=as.character(seq(1990,2000,by = 1))
pop90_00$state[30:35] = states$state.name[29:34]
pop90_00$state[40:42] = states$state.name[39:41]
pop90_00$state[9] = states$state.name[51]
pop90_00$state[49] = states$state.name[48]
# 2000-2010
pop00_10 = read.csv("https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/st-est00int-agesex.csv")%>%
clean_names()%>%
filter(sex == 0, age == 999)%>%
select(c(name,starts_with("popestimate")))%>%
rename_at(vars(starts_with("popestimate")),funs(substr(.,start = 12, stop = 15)))%>%
rename(state = name)%>%
mutate(state = as.character(state))%>%
mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))%>%
filter(state != "United States")
# 2010-2018
pop10_18 = read_csv("../../data/pop_2010-2018.csv")
colnames(pop10_18)=pop10_18[1,]
pop10_18 = pop10_18%>%
slice(-1)%>%
rename_at(vars(starts_with("Pop")), funs(substr(.,start = 38,stop = 41)))%>%
rename(state = Geography)%>%
select(c(state,starts_with("20")))%>%
mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))
# 2019
pop2019 = read_csv("../../data/2019pop.csv")%>%
clean_names()%>%
select(c(state,pop))%>%
rename("2019"=pop)
pops = left_join(pop7080,pop8090%>%select(-c("1980")), by = "state")%>%
left_join(states,by = c("state"="state.abb"))%>%
mutate(state = state.name)%>%
select(-state.name)%>%
left_join(pop90_00%>%select(-c("1990")), by = "state")%>%
left_join(pop00_10%>%select(-c("2000")), by = "state")%>%
left_join(pop10_18%>%select(-c("2010")), by = "state")%>%
left_join(pop2019,by = "state")%>%
gather(key = "year", value = "population", c(2:51))
write.csv(pops, "state_populations.csv")
}
Read in population data
# read in population data (include abbreviated and full state name)
pops = read_csv("state_populations.csv")%>%
select(-X1)%>%
left_join(states, by = c("state"="state.name"))
Incident count format
sts = unique(ss$state)
yrs = seq(range(ss$year)[1],range(ss$year)[2],by=1)
st_yr = data.frame(state = rep(sts,length(yrs)), year = sort(rep(yrs,length(sts))))%>%
arrange(state)
## data set aggrgated by state and year
ss_counts = ss%>%
group_by(state,year) %>%
summarize(incident_count = n(),
total_victims = sum(total_injured_killed_victims),
total_fatalities = sum(killed_includes_shooter),
total_wounded = sum(wounded),
avg_victims = mean(total_injured_killed_victims),
avg_fatalities = mean(killed_includes_shooter),
ave_wounded = mean(wounded),
avg_shooter_age = mean(shooter_age),
max_shooter_age = max(shooter_age),
min_shooter_age = min(shooter_age),
avg_reliability = mean(reliability_score_1_5),
target = mean(ifelse(targeted_specific_victim_s=="Y",1,
ifelse(targeted_specific_victim_s=="N",0,NA)),na.rm = T),
random_vict = mean(ifelse(random_victims=="Y",1,
ifelse(random_victims=="N",0,NA)),na.rm = T),
bullied = mean(ifelse(bullied_y_n_n_a=="Y",1,
ifelse(bullied_y_n_n_a=="N",0,NA)),na.rm = T),
domestic_violence = mean(ifelse(domestic_violence_y_n=="Y",1,
ifelse(domestic_violence_y_n=="N",0,NA)),na.rm = T))%>%
mutate(in_ss = TRUE)%>%
full_join(st_yr, by = c("state","year"))%>%
left_join(pops%>%rename(full_state_name = state), by = c("state"="state.abb", "year"))%>%
mutate(incident_count = ifelse(is.na(incident_count),0,incident_count))%>%
mutate(in_ss = ifelse(is.na(in_ss),FALSE,in_ss))
Grade data
# read in state grade data
if(import_raw_data){
grades18 = read_csv("../../data/2018grades.csv")%>%
select(-c(X6,X7))%>%
clean_names()%>%
select(-gun_death_rate_per_100k)%>%
mutate(year = "2018")%>%
rename(grade = x2018_grade)
grades17 = read_csv("../../data/2017grades.csv")%>%
clean_names()%>%
mutate(year = "2017")%>%
rename(grade = x2017_grade)
grades16 = read_csv("../../data/2016grades.csv")%>%
clean_names()%>%
mutate(year = "2016")%>%
rename(grade = x2016_grade)
grades15 = read_csv("../../data/2015grades.csv")%>%
clean_names()%>%
mutate(year = "2015")%>%
rename(grade = x2015_grade)
grades14 = read_csv("../../data/2014grades.csv")%>%
clean_names()%>%
mutate(year = "2014")%>%
rename(grade = x2014_grade)
grades13 = read_csv("../../data/2013grades.csv")%>%
clean_names()%>%
mutate(year = "2013")%>%
rename(grade = x2013_grade)
grades12 = read_csv("../../data/2012grades.csv")%>%
clean_names()%>%
mutate(year = "2012")%>%
rename(grade = x2012_grade)
# combine all year data
grades = bind_rows(grades18,grades17)%>%
bind_rows(grades16)%>%
bind_rows(grades15)%>%
bind_rows(grades14)%>%
bind_rows(grades13)%>%
bind_rows(grades12)
write.csv(grades,"state_grades.csv")
}
Read in grade data
grades = read_csv("state_grades.csv")%>%
select(-X1)
gpa_convert = data.frame(letter_grade = c("A","A-","B+","B","B-",
"C+","C","C-","D+","D","D-","F"),
gpa = c(4,3.7,3.3,3,2.7,2.3,2,1.7,1.3,1,0.7,0))
ss_counts = ss_counts%>%
left_join(grades%>%select(state,grade,year),by = c("full_state_name"="state","year"))%>%
left_join(gpa_convert, by = c("grade"="letter_grade"))
Poverty data
pov=read.csv("census_poverty_data.csv",stringsAsFactors = FALSE)
pov=filter(pov,State!=" United States"&State!=" United States"& State!="District of Columbia"&State!="United States")
names(pov)[1:2]<-c("state","poverty")
names(pov)[4]<-"year"
skel=expand_grid(sort(unique(pov$state)),min(pov$year):(max(pov$year)+2))
names(skel)<-c("state","year")
sk1=filter(skel,year==2007|year==2008|year==2009)
sk1=left_join(sk1,filter(pov,year==2007),by="state")
sk2=filter(skel,year==2010|year==2011|year==2012)
sk2=left_join(sk2,filter(pov,year==2010),by="state")
sk3=filter(skel,year==2013|year==2014|year==2015)
sk3=left_join(sk3,filter(pov,year==2013),by="state")
sk4=filter(skel,year==2016|year==2017|year==2018)
sk4=left_join(sk4,filter(pov,year==2016),by="state")
pov=rbind(sk1,sk2,sk3,sk4)
pov=pov[,1:3]
names(pov)<-c("state","year","poverty")
Merge poverty data
pov = left_join(pov,states, by = c("state"="state.name"))%>%
select(-state)%>%
rename(state = state.abb)
ss_counts = left_join(ss_counts, pov, by = c("state","year"))
Bullying data
bully = read_csv("State_bullying.csv")%>%
select(c(total_score,state.abb))%>%
rename(state = state.abb)%>%
mutate(year = 2010)
ss_counts = left_join(ss_counts, bully%>%select(-year), by = "state")%>%
rename(bullying_score = total_score)%>%
mutate(bullying_score = as.numeric(bullying_score))
Mental Health data
mh.data=read.csv("mh.data.csv",stringsAsFactors = FALSE)%>%
clean_names()%>%
select(-x)%>%
left_join(states, by = c("state"="state.name"))%>%
select(-state)%>%
rename(state = state.abb)
ss_counts = ss_counts%>%
left_join(mh.data, by = c("state","year"))%>%
rename(mh_score = percent)
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
# Rand Laws Data
data.rand=read.csv("rand.laws.database.csv",stringsAsFactors = FALSE)
CDC Bullying Data
qns = c("qn13","qn14","qn15","qn16","qn17","qn18","qn19","qn20","qn24","qn25","qn26","qn27","qn28","qn29","qn30","qnbullyweight","qnbullygay")
sts = yrbss::getListOfParticipatingStates()
yrs = c(1991,1993,1995,1997,1999,2001,2003,2005,2007,2009,2011,2013,2015)
if(import_raw_data){
bullying = data.frame(variable = NULL, state = NULL, year = NULL, prop= NULL, ciLB= NULL, ciUB = NULL, n = NULL)
for(v in qns){
for(s in sts){
for(y in yrs ){
p = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$prop
ciL = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciLB
ciU = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciUB
sample_size = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$n
newrow = data.frame(variable = v, state = s, year = y, prop= p, ciLB= ciL, ciUB = ciU, n = sample_size)
bullying = bind_rows(bullying,newrow)}
}}
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
bullying = left_join(bullying,lab, by = c(variable = "question"))
bullying$state[which(bullying$state=="AZB")]="AZ"
write_csv(bullying, "bullying_survey.csv")
}
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
# read in csv
bullying_survey = read_csv("bullying_survey.csv")
## Parsed with column specification:
## cols(
## variable = col_character(),
## state = col_character(),
## year = col_double(),
## prop = col_double(),
## ciLB = col_double(),
## ciUB = col_double(),
## n = col_double(),
## label = col_character()
## )
bullying_to_merge = read_csv("bullying_survey.csv")%>%
select(-c(ciLB,ciUB,n,label))%>%
gather(key = stat_type, value = value, prop)%>%
spread(variable, value)%>%
select(-stat_type)
## Parsed with column specification:
## cols(
## variable = col_character(),
## state = col_character(),
## year = col_double(),
## prop = col_double(),
## ciLB = col_double(),
## ciUB = col_double(),
## n = col_double(),
## label = col_character()
## )
colnames(bullying_to_merge)[3:17]=as.character(lab$label[1:15])
bullying_to_merge = bullying_to_merge%>%
clean_names()
ss_counts = left_join(ss_counts, bullying_to_merge, by = c("state","year"))
# check missingness for 2009-2015
sum(!complete.cases(filter(bullying_survey, year>=2009)))/nrow(filter(bullying_survey, year>=2009))
## [1] 0.2949126
### We're going to need to impute values for missing years and states for some of these variables if these data are going to be useful in our analysis
Make combined variables for survey questions
# combine suicide questions
suicide = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
suicide_qns = bullying_survey%>%
filter(variable %in% c("qn26","qn27","qn28","qn29","qn30"))
for(y in yrs){
for(st in sts){
dat = filter(suicide_qns,(year ==y & state == st))
suicide_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = suicide_score, n = samp)
suicide = bind_rows(suicide,newrow)
}
}
# combine questions involving physical fights
fight = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
fight_qns = bullying_survey%>%
filter(variable %in% c("qn18","qn19","qn20"))
for(y in yrs){
for(st in sts){
dat = filter(fight_qns,(year ==y & state == st))
fight_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = fight_score, n = samp)
fight = bind_rows(fight,newrow)
}
}
# school safety questions
safety = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
safety_qns = bullying_survey%>%
filter(variable %in% c("qn13","qn14","qn15","qn16","qn17"))
for(y in yrs){
for(st in sts){
dat = filter(safety_qns,(year ==y & state == st))
safety_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = safety_score, n = samp)
safety = bind_rows(safety,newrow)
}
}
# bullying questions
bull = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
bully_qns = bullying_survey%>%
filter(variable %in% c("qn24","qn25","qnbullyweight","qnbullygay"))
for(y in yrs){
for(st in sts){
dat = filter(bully_qns,(year ==y & state == st))
bully_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = bully_score, n = samp)
bull = bind_rows(bull,newrow)
}
}
survey_combined = bind_cols(suicide,safety, fight, bull)%>%
select(year, state, score, n, score1, n1, score2, n2, score3, n3)%>%
rename(suicide_score = score, school_safety_score = score1, fight_score = score2, bully_score = score3)%>%
filter(year>=2009)%>%
full_join(st_yr%>%filter(year>=2009))%>%
arrange(year)%>%
arrange(state)%>%
mutate(state = ifelse(state == "AZB","AZ",state))
survey_combined = survey_combined%>%
group_by(state)%>%
mutate(suicide_score =ifelse(year<2016,ifelse(is.na(suicide_score), na.locf(suicide_score)/2+na.locf(suicide_score, fromLast = TRUE)/2,suicide_score),
NA ))%>%
mutate(school_safety_score =ifelse(year<2016,ifelse(is.na(school_safety_score), na.locf(school_safety_score)/2+na.locf(school_safety_score, fromLast = TRUE)/2,school_safety_score),NA))%>%
mutate(fight_score =ifelse(year<2016,ifelse(is.na(fight_score), na.locf(fight_score)/2+na.locf(fight_score, fromLast = TRUE)/2,fight_score),NA))%>%
mutate(bully_score =ifelse(year<2016,ifelse(is.na(bully_score), na.locf(bully_score)/2+na.locf(bully_score, fromLast = TRUE)/2,bully_score),NA))%>%
mutate(n=ifelse(year<2016,ifelse(is.na(n), na.locf(n)/2+na.locf(n, fromLast = TRUE)/2,n),NA))%>%
mutate(n1=ifelse(year<2016,ifelse(is.na(n1), na.locf(n1)/2+na.locf(n1, fromLast = TRUE)/2,n1),NA))%>%
mutate(n2=ifelse(year<2016,ifelse(is.na(n2), na.locf(n2)/2+na.locf(n2, fromLast = TRUE)/2,n2),NA))%>%
mutate(n3=ifelse(year<2016,ifelse(is.na(n3), na.locf(n3)/2+na.locf(n3, fromLast = TRUE)/2,n3),NA))
# use fitted line to get 2016-2019 values
for(s in sts[which(!(sts %in% c("MA","AZB")))]){
dat = filter(survey_combined, state ==s)
ind = which(is.na(dat$suicide_score))[1]
slp = dat[ind-1,3:10]-dat[ind-2,3:10]
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
survey_combined[which(survey_combined$state == s),]=dat
}
EDA
table(ss$reliability_score_1_5)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(fill = Var1), stat = "identity")+
scale_fill_discrete(name = "Reliability Score")+
labs(x = "Reliability Score", y = "Count")

table(ss$school_type)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(fill = Var1), stat = "identity")+
scale_fill_discrete(name = "School Type")+
labs(x = "School Type", y = "Count")

ss%>%
filter(shooter_age<200)%>%
ggplot(aes(x = reorder(state, shooter_age, FUN = mean), y = shooter_age))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Age of shooter", x = "State", title = "Age of Shooter by State")

ss%>%
ggplot(aes(x = reorder(state, killed_includes_shooter, FUN = mean), y = killed_includes_shooter))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Numbeer Killed (including shooter)", x = "State", title = "Fatalities")

ss%>%
ggplot(aes(x = reorder(state, wounded, FUN = mean), y = total_injured_killed_victims))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Wounded ", x = "State", title = "Number Wounded")

ss%>%
ggplot(aes(x = reorder(state, total_injured_killed_victims, FUN = mean), y = total_injured_killed_victims))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Wounded or Killed", x = "State", title = "Combined Wounded & Fatalities")

table(ss$state)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(x = reorder(Var1,Freq, desc)), stat = "identity")+
labs(x = "State", y = "Count", "Number of incidents by State (overall)")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

Grade plots
cc = scales::seq_gradient_pal("green", "red", "Lab")(seq(0,1,length.out=12))
# This will handle the ordering
d <- ss_counts %>%
filter(year %in% seq(2012,2018,by=1))%>%
ungroup() %>% # As a precaution / handle in a separate .grouped_df method
arrange(year, incident_count) %>% # arrange by facet variables and continuous values
mutate(.r = row_number()) # Add a row number variable
ggplot(d, aes(x = .r, y= incident_count, fill = grade)) + # Use .r instead of x
geom_point(data = d%>%filter(incident_count==0),
aes(x = .r, y= incident_count, color = grade),
size = 0.7) +
geom_bar(stat = "identity")+
facet_wrap(~ year, scales = "free_x") + # Should free scales (though could be left to user)
scale_x_continuous( # This handles replacement of .r for x
breaks = d$.r, # notice need to reuse data frame
labels = d$state
)+
scale_fill_manual(values = cc, name = "Grade")+
scale_color_manual(values = cc, guide = FALSE)+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(0.7,.15), legend.direction = "horizontal",legend.key.size = unit(1.5,"line"), legend.text = element_text(size = 8), legend.title = element_text(size = 10))

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
filter(!is.na(grade))%>%
ggplot(aes(x = incident_count , fill = grade))+
geom_histogram(binwidth = 1)+
facet_wrap(.~year)+
scale_fill_manual(values = cc)+
scale_x_continuous(breaks = seq(0,35,by=5))+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5))+
theme_bw()+
labs(x= "Number of School Shootings")

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
filter(!is.na(grade))%>%
ggplot(aes(x = grade, y = incident_count, color = grade))+
geom_boxplot()+
facet_wrap(.~year)+
theme_bw()+
scale_color_manual(values = cc)+
labs(y= "Number of School Shootings", x = "Grade")

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
ggplot()+
geom_bar(aes(x = reorder(state,population), y = incident_count), stat = "identity")+
geom_point(aes(x = state, y = log(population), color = "population"), size = .4)+
theme_bw()+
labs(x = "State", y = "Number of School Shootings", title = "Number of School Shooting and Population by State")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(.7, 0), legend.justification = c(1, 0))+
facet_wrap(.~year)+
scale_color_discrete(name = NULL, labels = "Population (log)")

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
group_by(state) %>%
mutate(ordr = mean(avg_victims, na.rm = T))%>%
filter(ordr!="NaN")%>%
ungroup()%>%
arrange(desc(ordr))%>%
ggplot(aes(x = reorder(state,ordr), y = avg_victims, color = year))+
geom_point(size = 0.6, position = "jitter")+
labs(x = "State", y = "Average number of victims per incident", title ="Victims per School Shooting")+
scale_color_viridis(option = "D")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

Media Plots
media%>%
ggplot(aes(x = prev_month_art, y = mass_shooting, color = as.numeric(year)))+geom_point(position = "jitter")+
scale_color_continuous(name = "Year")+
labs(y = "Mass Shooting Count", x = "Articles in Previous Month")+
theme_bw()

media%>%
ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations, x= as.numeric(year), color = as.numeric(year)))+
geom_point()+
geom_jitter()+
labs(y = "Monthly Articles",x = "Year")+
theme(legend.position = "none")+
theme_bw()

media%>%
ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations/yearly_shootings, x= as.numeric(year), color = as.numeric(year)))+
geom_point()+
geom_jitter()+
labs(y = "Monthly Articles/Yearly Mass Shootings",x = "Year")+
theme(legend.position = "none")+
theme_bw()

media%>%
ggplot(aes(y = yearly_articles, x= as.numeric(year)))+
geom_point()+
labs(y = "Yearly Articles",x = "Year")+
theme_bw()

school_counts = ss%>%
select(school,city,state)%>%
unite(col = school_city,c(school,city,state), sep = "_")%>%
table(dnn = c("school","count"))%>%
data.frame()%>%
separate(col = school_city, into = c("school","city","state"), sep = "_" )
delta = data.frame(state = NULL, year = NULL, delta_gpa = NULL, delta_ss = NULL, delta_pop = NULL)
for(x in unique(ss_counts$state)){
dat = filter(ss_counts, state == x)%>%
arrange(year)
chng_gpa = c(NA,diff(dat$gpa))
chng_ss = c(NA, diff(dat$incident_count))
chng_pop = c(NA, diff(dat$population))
st = rep(x, nrow(dat))
newrows =data.frame(state = st, year = dat$year, delta_gpa = chng_gpa, delta_ss = chng_ss, delta_pop= chng_pop)
delta = bind_rows(delta,newrows)
}
ss_counts%>%
group_by(year)%>%
summarise(yearly_incidents = sum(incident_count), prev_year_art = prev_year_art[1] )%>%
ggplot(aes(x = year, y = yearly_incidents, fill = prev_year_art))+geom_bar(stat = "identity")+
labs(y = "Yearly School Shootings", x = "Year")+
scale_fill_viridis(option = "D", name = "Articles in previous year")+
theme_bw()

Bullying survey plots
ggplot(data= bullying_survey%>%filter(year>=2013), aes(x = year, y = prop, grouby_by = state, color = variable))+
geom_line(position = "jitter")+
scale_color_viridis_d()+
theme_bw()

ggplot(data= bullying_survey%>%filter(variable %in% qns[9:17]), aes(x = year, y = prop, color = variable))+
geom_point(position = "jitter")+
geom_smooth(se=F)+
scale_color_viridis_d()+
theme_bw()

lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
kableExtra::kable(lab, byrow = F, caption = "Questions topics")
Questions topics
|
question
|
label
|
|
qn13
|
Carried a weapon
|
|
qn14
|
Carried a gun
|
|
qn15
|
Carried a weapon on school property
|
|
qn16
|
Did not go to school because they felt unsafe at school or on their way to or from school
|
|
qn17
|
Were threatened or injured with a weapon on school property
|
|
qn18
|
Were in a physical fight
|
|
qn19
|
Were injured in a physical fight
|
|
qn20
|
Were in a physical fight on school property
|
|
qn24
|
Were bullied on school property
|
|
qn25
|
Were electronically bullied
|
|
qn26
|
Felt sad or hopeless
|
|
qn27
|
Seriously considered attempting suicide
|
|
qn28
|
Made a plan about how they would attempt suicide
|
|
qn29
|
Attempted suicide
|
|
qn30
|
Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse
|
|
qnbullyweight
|
Been teased b/c of weight past 12 mos
|
|
qnbullygay
|
Been teased b/c labeled GLB past 12 mos
|
bullying_survey%>%
slice(which(complete.cases(bullying_survey)))%>%
select(label)%>%
table()%>%
data.frame()%>%
kableExtra::kable(caption = "Observations per Question")
Observations per Question
|
.
|
Freq
|
|
Attempted suicide
|
298
|
|
Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse
|
282
|
|
Been teased b/c labeled GLB past 12 mos
|
18
|
|
Been teased b/c of weight past 12 mos
|
18
|
|
Carried a gun
|
231
|
|
Carried a weapon
|
275
|
|
Carried a weapon on school property
|
292
|
|
Did not go to school because they felt unsafe at school or on their way to or from school
|
298
|
|
Felt sad or hopeless
|
251
|
|
Made a plan about how they would attempt suicide
|
291
|
|
Seriously considered attempting suicide
|
308
|
|
Were bullied on school property
|
124
|
|
Were electronically bullied
|
94
|
|
Were in a physical fight
|
298
|
|
Were in a physical fight on school property
|
294
|
|
Were injured in a physical fight
|
270
|
|
Were threatened or injured with a weapon on school property
|
291
|
Model Fitting
fit = glm(incident_count~gpa+poverty+bullying_score+mh_score+ prev_year_art+year+offset(log(population)), family = "poisson", data = ss_counts)
if(FALSE){
ts_y = ts(ss_counts%>%slice(which(complete.cases(ss_counts)))%>%select(incident_count))
ts_x = ss_counts%>%
ungroup()%>%
slice(which(complete.cases(ss_counts)))%>%
select(c(gpa,poverty,bullying_score,yearly_articles,population))%>%
mutate(population = log(population))%>%
as.matrix()
ts_fit = tsglm(ts_y, xreg = ts_x, model = list(past_obs = 1, external = TRUE), link = "log", distr = "poisson")
best_fit = forecast::auto.arima(diff(ts_y), xreg = diff(ts_x))
astsa::sarima(diff(ts_y),p = 0, d = 0, q =0, xreg = diff(ts_x))
arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef
sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
cilb= arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef-sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
ciub = arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef+sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
data.frame(Lower= cilb, Upper = ciub)
}